Global helper functions (e.g., converting the `key` factor to dummy variables)

source("thesis_functions.R")
key.dummy <- function(x){
  # Expand the `key` factor into 0/1 indicator columns key1..key11 and
  # drop the original column. Level 0 is the implicit baseline (no key0),
  # matching the dummy coding produced by model.matrix / caret's train().
  #
  # x: data frame (or tibble/data.table) with a `key` column whose values
  #    compare equal to 1..11 (factor levels coerce to character for ==).
  # Returns: x with key1..key11 appended (in order) and `key` removed.
  for (k in 1:11) {
    # factor == numeric compares on the level labels, so "4" == 4 is TRUE
    x[[paste0("key", k)]] <- ifelse(x[["key"]] == k, 1, 0)
  }
  x[["key"]] <- NULL
  x
}

#train/valid/test function from data mining class:
#train/valid/test function from data mining class:
partition.3 <- function(data, prop.train, prop.val){
  # Randomly partition `data` into train / validation / test sets.
  #
  # data:       data frame to split
  # prop.train: proportion of rows for the training set
  # prop.val:   proportion of rows for the validation set
  # The remaining rows (1 - prop.train - prop.val) become the test set.
  # Returns: list(data.train, data.test, data.val).
  n <- nrow(data)
  # select a random sample of size = prop.train % of total records
  # (seq_len is safe for n == 0, unlike 1:n)
  selected1 <- sample(seq_len(n), round(n * prop.train), replace = FALSE)
  # create training data which has prop.train % of total records
  data.train <- data[selected1, ]
  # select a random sample of size = prop.val % of the total records
  rest <- setdiff(seq_len(n), selected1)
  selected2 <- sample(rest, round(n * prop.val), replace = FALSE)
  # create validation data which has prop.val % of total records
  data.val <- data[selected2, ]
  # create testing data with the remaining records
  data.test <- data[setdiff(rest, selected2), ]
  return(list(data.train = data.train, data.test = data.test, data.val = data.val))
}

#creates plot for optimal cut off value
#creates plot for optimal cut off value
opt.cut.func <- function(model, data){
  # Sweep classification cutoffs for a fitted glm and record metrics.
  #
  # model: fitted glm (binomial); data: evaluation set with a `popular`
  #        factor column (levels "0"/"1").
  # Returns: list(cutoff, sensitivity.vec, specificity.vec, ssdiff.vec,
  #          kappa.vec) — one metric value per cutoff in seq(0, 1, 0.01).
  cutoff <- seq(0, 1, 0.01)

  # preallocate one metric vector per statistic
  sensitivity.vec <- rep(NA, length(cutoff))
  specificity.vec <- rep(NA, length(cutoff))
  ssdiff.vec <- rep(NA, length(cutoff))
  kappa.vec <- rep(NA, length(cutoff))

  # Predicted probabilities do not depend on the cutoff, so compute them
  # once instead of once per iteration (was recomputed 101 times).
  pred.prob.val <- predict(model, newdata = data, type = "response")

  for(i in seq_along(cutoff)){
    # Pin both levels so confusionMatrix still works when every
    # prediction falls on one side of an extreme cutoff (0 or 1).
    pred.y.val <- factor(ifelse(pred.prob.val > cutoff[i], 1, 0), levels = c(0, 1))
    # warning messages galore because the probability of a value actually being popular is SO LOW
    cm <- confusionMatrix(pred.y.val, data$popular,
                          positive = "1")
    sensitivity.vec[i] <- cm$byClass["Sensitivity"]
    specificity.vec[i] <- cm$byClass["Specificity"]
    ssdiff.vec[i] <- abs(sensitivity.vec[i] - specificity.vec[i])
    kappa.vec[i] <- cm$overall["Kappa"]
  }
  return(list(cutoff = cutoff, sensitivity.vec = sensitivity.vec, specificity.vec = specificity.vec, ssdiff.vec = ssdiff.vec, kappa.vec = kappa.vec))
}

reg.opt.cut.func <- function(model, data){
  # As opt.cut.func, but for a caret-trained glmnet model whose predict
  # returns a two-column probability data frame (column 2 = P(class "1")).
  #
  # model: caret `train` object; data: evaluation set with `popular`.
  # Returns: list(cutoff, sensitivity.vec, specificity.vec, ssdiff.vec,
  #          kappa.vec).
  cutoff <- seq(0, 1, 0.01)

  # preallocate one metric vector per statistic
  sensitivity.vec <- rep(NA, length(cutoff))
  specificity.vec <- rep(NA, length(cutoff))
  ssdiff.vec <- rep(NA, length(cutoff))
  kappa.vec <- rep(NA, length(cutoff))

  # Probabilities are cutoff-independent: compute once, not per iteration.
  # NOTE(review): `s = model$bestTune` looks redundant — caret's
  # predict.train already uses the best tune — confirm before removing.
  pred.prob.val <- predict(model, s = model$bestTune, data, type = "prob")

  for(i in seq_along(cutoff)){
    # Pin both levels so confusionMatrix still works when every
    # prediction falls on one side of an extreme cutoff (0 or 1).
    pred.y.val <- factor(ifelse(pred.prob.val[, 2] > cutoff[i], 1, 0), levels = c(0, 1))
    # warning messages galore because the probability of a value actually being popular is SO LOW
    cm <- confusionMatrix(pred.y.val, data$popular,
                          positive = "1")
    sensitivity.vec[i] <- cm$byClass["Sensitivity"]
    specificity.vec[i] <- cm$byClass["Specificity"]
    ssdiff.vec[i] <- abs(sensitivity.vec[i] - specificity.vec[i])
    kappa.vec[i] <- cm$overall["Kappa"]
  }
  return(list(cutoff = cutoff, sensitivity.vec = sensitivity.vec, specificity.vec = specificity.vec, ssdiff.vec = ssdiff.vec, kappa.vec = kappa.vec))
}

opt.cut.plot <- function(opt.out){
  # Plot sensitivity, specificity, and kappa against the cutoff grid
  # produced by opt.cut.func / reg.opt.cut.func, with a legend.
  # Base curve: sensitivity (blue) establishes the axes.
  plot(opt.out$cutoff, opt.out$sensitivity.vec, xlab = "cutoff", type = "l", col = "blue")
  # Overlay the remaining metric curves on the same axes.
  overlays <- list(specificity.vec = "green", kappa.vec = "red")
  for (metric in names(overlays)) {
    lines(opt.out$cutoff, opt.out[[metric]], type = "l", col = overlays[[metric]])
  }
  legend(x = "right", legend = c("Sensitivity", "Specificity", "Kappa"),
         col = c("blue", "green", "red"), lty = 1, lwd = 1)
}

Data loading and preparation

# Load the raw K-pop track data and coerce column types:
# artist metadata and discrete audio features become factors, the release
# date becomes POSIXct, and `popular` is derived as a binary label
# (popularity score >= 50 -> "1", else "0").
data <- fread("kpopdata.csv")
data <- mutate(data, ArtistType = as.factor(ArtistType),
               ArtistGender = as.factor(ArtistGender),
               ArtistGen = factor(ArtistGen),
               release_date = as.POSIXct(release_date, format = "%m/%d/%y"),
               # pin all 12 pitch-class levels even if some are unobserved
               key = factor(key, levels = 0:11),
               mode = as.factor(mode),
               time_signature = factor(time_signature, levels = c(1,3,4,5)),
               popular = factor(ifelse(popularity >=50, 1, 0)))

Understanding the popularity score

hist(data$popularity)

summary(data$popularity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   11.00   23.00   25.88   39.00   94.00

heavily skewed right.

The square root transformation makes it more normal. This will help to meet the multiple linear regression assumptions.

hist(data$popularity^0.5)

Classification of popularity (binary label)

table(data$popular)/nrow(data)
## 
##         0         1 
## 0.8812021 0.1187979

If a song is classified as popular when its score is 50 or greater, only ~12% of the data is considered a popular song.

kpop <- dplyr::select(data, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, time_signature, tempo, valence)

General Assumptions: continuous response: popularity score ranging from 0 - 100. mix of categorical and continuous response. the distribution of the variables are not normal, we will check for normality of error terms where appropriate.

Goal: create model for predicting popularity scores

Multiple Linear Model?

select just audio features

kpop <- select(data, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, tempo, valence)
kpop0 <- kpop %>% filter(mode == 0)%>% select(-mode)
kpop1 <- kpop %>% filter(mode == 1) %>% select(-mode)

### Kpop mode 0 train and test
smpl.size0 <- floor(0.75*nrow(kpop0))
set.seed(123)
smpl0 <- sample(nrow(kpop0), smpl.size0, replace = FALSE)
train0 <- kpop0[smpl0,]
test0 <- kpop0[-smpl0,]

### Kpop mode 1 train and test
smpl.size1 <- floor(0.75*nrow(kpop1))
set.seed(123)
smpl1 <- sample(nrow(kpop1), smpl.size1, replace = FALSE)
train1 <- kpop1[smpl1,]
test1 <- kpop1[-smpl1,]

For Songs with Mode 0

Full Multiple Linear Regression

fit a multiple linear regression model

ml0 <- lm(popularity ~. , data = train0)
summary(ml0)
## 
## Call:
## lm(formula = popularity ~ ., data = train0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.283 -13.161  -1.606  11.489  64.381 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       98.43022    4.99248  19.716  < 2e-16 ***
## duration          -5.41104    0.55905  -9.679  < 2e-16 ***
## acousticness      -3.81930    2.05303  -1.860 0.062924 .  
## danceability      -2.19279    3.28543  -0.667 0.504544    
## energy           -33.97108    3.33505 -10.186  < 2e-16 ***
## instrumentalness -15.67494    4.37625  -3.582 0.000346 ***
## key1              -1.20599    1.49947  -0.804 0.421291    
## key2              -2.72991    1.85598  -1.471 0.141416    
## key3               3.04084    2.07581   1.465 0.143040    
## key4              -4.34609    1.46983  -2.957 0.003129 ** 
## key5              -0.56490    1.42695  -0.396 0.692217    
## key6               1.34681    1.48984   0.904 0.366062    
## key7               0.04123    1.63277   0.025 0.979856    
## key8              -1.57381    1.76361  -0.892 0.372251    
## key9              -0.94547    1.51224  -0.625 0.531872    
## key10             -1.75720    1.55635  -1.129 0.258955    
## key11             -2.41842    1.38950  -1.740 0.081861 .  
## loudness           3.55623    0.20271  17.543  < 2e-16 ***
## speechiness       23.77941    4.74755   5.009 5.75e-07 ***
## tempo             -0.00429    0.01255  -0.342 0.732398    
## valence          -11.88489    1.78863  -6.645 3.51e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.34 on 3483 degrees of freedom
## Multiple R-squared:  0.143,  Adjusted R-squared:  0.138 
## F-statistic: 29.05 on 20 and 3483 DF,  p-value: < 2.2e-16
plot(ml0)

no or little multicollinearity

no autocorrelation

homoscedasticity does not hold (the residual variance is not constant), which motivates transforming the response.

Try again with a transformation to make popularity more normal (the square root):

ml0.sqrt <- lm(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5))
summary(ml0.sqrt)
## 
## Call:
## lm(formula = popularity ~ ., data = train0 %>% mutate(popularity = popularity^0.5))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9651 -1.2163  0.1704  1.3715  5.2110 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      12.8026945  0.5397564  23.719  < 2e-16 ***
## duration         -0.5857021  0.0604408  -9.691  < 2e-16 ***
## acousticness     -0.1418036  0.2219612  -0.639 0.522952    
## danceability     -0.1381154  0.3552010  -0.389 0.697420    
## energy           -3.9085950  0.3605651 -10.840  < 2e-16 ***
## instrumentalness -2.2443174  0.4731338  -4.744 2.18e-06 ***
## key1             -0.1306867  0.1621132  -0.806 0.420214    
## key2             -0.3754834  0.2006575  -1.871 0.061392 .  
## key3              0.1823478  0.2244238   0.813 0.416551    
## key4             -0.5438271  0.1589089  -3.422 0.000628 ***
## key5             -0.1190871  0.1542733  -0.772 0.440212    
## key6              0.1383520  0.1610728   0.859 0.390433    
## key7             -0.0235720  0.1765251  -0.134 0.893779    
## key8             -0.1566102  0.1906708  -0.821 0.411495    
## key9             -0.1520276  0.1634941  -0.930 0.352505    
## key10            -0.2351475  0.1682632  -1.397 0.162353    
## key11            -0.2909129  0.1502246  -1.937 0.052885 .  
## loudness          0.4225379  0.0219162  19.280  < 2e-16 ***
## speechiness       2.1574574  0.5132766   4.203 2.70e-05 ***
## tempo            -0.0007701  0.0013563  -0.568 0.570207    
## valence          -1.1893401  0.1933758  -6.150 8.60e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.874 on 3483 degrees of freedom
## Multiple R-squared:  0.1594, Adjusted R-squared:  0.1546 
## F-statistic: 33.03 on 20 and 3483 DF,  p-value: < 2.2e-16

assumption checking and diagnostics

plot(ml0.sqrt)

prediction on test data

# prediction on test data
yhat.mlr = predict(ml0.sqrt, newdata = test0 %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.mlr <- yhat.mlr - test0$popularity^0.5
rmse.mlr <- sqrt(mean(error.mlr^2))
data.frame(
  RMSE = RMSE(yhat.mlr, test0$popularity^0.5),
  R2 = R2(yhat.mlr, test0$popularity^0.5)
)
##       RMSE        R2
## 1 1.863478 0.1332768

Much improved!

Variable Selection: Stepwise 10 fold cross validation

set.seed(123)
train_control <- trainControl(method = "cv", 
                              number = 10) 
bye <- capture.output(mlr.step_kcv <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5),  
                 method = "lmStepAIC", trControl = train_control)) 
print(mlr.step_kcv)
## Linear Regression with Stepwise Selection 
## 
## 3504 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 3153, 3152, 3154, 3154, 3154, 3154, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1.878512  0.1528544  1.516643
mlr.step_kcv$finalModel 
## 
## Call:
## lm(formula = .outcome ~ duration + energy + instrumentalness + 
##     key2 + key3 + key4 + key6 + key11 + loudness + speechiness + 
##     valence, data = dat)
## 
## Coefficients:
##      (Intercept)          duration            energy  instrumentalness  
##          12.4040           -0.5830           -3.8007           -2.2441  
##             key2              key3              key4              key6  
##          -0.2588            0.3053           -0.4241            0.2617  
##            key11          loudness       speechiness           valence  
##          -0.1689            0.4214            2.1000           -1.2328

prediction on test data

# prediction on test data
yhat.step_kcv = predict(mlr.step_kcv$finalModel, newdata=key.dummy(test0) %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.step_kcv <- yhat.step_kcv - test0$popularity^0.5
rmse.step_kcv <- sqrt(mean(error.step_kcv^2))
data.frame(
  RMSE = RMSE(yhat.step_kcv, test0$popularity^0.5),
  R2 = R2(yhat.step_kcv, test0$popularity^0.5)
)
##       RMSE        R2
## 1 1.861179 0.1350903

Both of the models have RMSE scores of around 2. This isn't terrible since the range of the transformed popularity scores is 0-10.

Regularized Regression: Ridge

lm.ridge0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.ridge0$bestTune
##    alpha lambda
## 47     0  0.047
# best coefficient
lm.ridge0.model <- coef(lm.ridge0$finalModel, lm.ridge0$bestTune$lambda)
lm.ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)      12.3411323068
## duration         -0.5762465119
## acousticness     -0.0662308488
## danceability     -0.1493986374
## energy           -3.5601398635
## instrumentalness -2.2691735701
## key1             -0.0831090967
## key2             -0.3241117452
## key3              0.2260133231
## key4             -0.4931505942
## key5             -0.0698858769
## key6              0.1837187046
## key7              0.0173666066
## key8             -0.1059775161
## key9             -0.1084064701
## key10            -0.1844827951
## key11            -0.2447391761
## loudness          0.3990852347
## speechiness       2.0404769494
## tempo            -0.0007694543
## valence          -1.1748730700
# prediction on test data
yhat.ridge0 = predict(lm.ridge0, s=lm.ridge0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
data.frame(
  RMSE = RMSE(yhat.ridge0, test0$popularity^0.5),
  R2 = R2(yhat.ridge0, test0$popularity^0.5)
)
##       RMSE        R2
## 1 1.861093 0.1339601

Regularized Regression: Lasso

lm.lasso0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.lasso0$bestTune
##    alpha lambda
## 17     1  0.017
# best coefficient
lm.lasso0.model <- coef(lm.lasso0$finalModel, lm.lasso0$bestTune$lambda)
lm.lasso0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                            1
## (Intercept)      11.92608192
## duration         -0.55317885
## acousticness      .         
## danceability      .         
## energy           -3.50918126
## instrumentalness -2.05824555
## key1              .         
## key2             -0.16147902
## key3              0.21698994
## key4             -0.36337170
## key5              .         
## key6              0.22354434
## key7              0.03593591
## key8              .         
## key9              .         
## key10            -0.04408683
## key11            -0.12086307
## loudness          0.39934685
## speechiness       1.77557012
## tempo             .         
## valence          -1.14005299
# prediction on test data
yhat.lasso0 = predict(lm.lasso0, s=lm.lasso0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.lasso0 <- yhat.lasso0 - test0$popularity^0.5
rmse.lasso0 <- sqrt(mean(error.lasso0^2))
data.frame(
  RMSE = RMSE(yhat.lasso0, test0$popularity^0.5),
  R2 = R2(yhat.lasso0, test0$popularity^0.5)
)
##       RMSE        R2
## 1 1.858953 0.1351553

Regularized Regression: Elastic Net

# Elastic net on the continuous sqrt-popularity response, tuning alpha and
# lambda jointly. The original passed family = "binomial", which is for
# binary outcomes and is mis-specified for a continuous response (glmnet's
# binomial family expects a two-class y); omitting it lets glmnet use its
# gaussian default, consistent with the ridge/lasso fits above.
lm.enet0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control,
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lm.enet0$bestTune
# best coefficient
lm.enet0.model <- coef(lm.enet0$finalModel, lm.enet0$bestTune$lambda)
lm.enet0.model
# prediction on test data
yhat.enet0 = predict(lm.enet0, s=lm.enet0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.enet0 <- yhat.enet0 - test0$popularity^0.5
rmse.enet0 <- sqrt(mean(error.enet0^2))
data.frame(
  RMSE = RMSE(yhat.enet0, test0$popularity^0.5),
  R2 = R2(yhat.enet0, test0$popularity^0.5)
)

For songs with Mode 1

Full Multiple linear model

ml1.sqrt <- lm(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5))
summary(ml1.sqrt)
## 
## Call:
## lm(formula = popularity ~ ., data = train1 %>% mutate(popularity = popularity^0.5))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4223 -1.2067  0.1313  1.3255  5.2394 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.857177   0.401243  27.059  < 2e-16 ***
## duration         -0.554933   0.044683 -12.419  < 2e-16 ***
## acousticness      0.080560   0.148271   0.543 0.586927    
## danceability      0.050885   0.254514   0.200 0.841542    
## energy           -3.077791   0.280908 -10.957  < 2e-16 ***
## instrumentalness -1.877299   0.352964  -5.319 1.09e-07 ***
## key1              0.321294   0.091623   3.507 0.000457 ***
## key2              0.207166   0.098073   2.112 0.034699 *  
## key3              0.430613   0.150398   2.863 0.004210 ** 
## key4             -0.029116   0.127272  -0.229 0.819058    
## key5              0.204278   0.111709   1.829 0.067504 .  
## key6              0.216946   0.109298   1.985 0.047206 *  
## key7              0.283270   0.091145   3.108 0.001894 ** 
## key8              0.297098   0.105736   2.810 0.004975 ** 
## key9              0.270884   0.113464   2.387 0.017001 *  
## key10             0.080066   0.132171   0.606 0.544685    
## key11             0.389907   0.117678   3.313 0.000928 ***
## loudness          0.386256   0.017387  22.216  < 2e-16 ***
## speechiness       4.579814   0.414965  11.037  < 2e-16 ***
## tempo            -0.001474   0.000958  -1.538 0.124055    
## valence          -0.849286   0.154553  -5.495 4.08e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.821 on 5483 degrees of freedom
## Multiple R-squared:  0.1499, Adjusted R-squared:  0.1468 
## F-statistic: 48.35 on 20 and 5483 DF,  p-value: < 2.2e-16

prediction on test data

# prediction on test data
yhat.mlr = predict(ml1.sqrt, newdata = test1 %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.mlr <- yhat.mlr - test1$popularity^0.5
rmse.mlr <- sqrt(mean(error.mlr^2))
data.frame(
  RMSE = RMSE(yhat.mlr, test1$popularity^0.5),
  R2 = R2(yhat.mlr, test1$popularity^0.5)
)
##       RMSE        R2
## 1 1.858473 0.1257546

Variable Selection: Stepwise 10 fold cross validation

set.seed(123)
train_control <- trainControl(method = "cv", 
                              number = 10) 
bye <- capture.output(mlr.step_kcv <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5),  
                 method = "lmStepAIC", trControl = train_control)) 
print(mlr.step_kcv)
## Linear Regression with Stepwise Selection 
## 
## 5504 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 4953, 4954, 4954, 4955, 4953, 4953, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1.823965  0.1452973  1.468792
mlr.step_kcv$finalModel 
## 
## Call:
## lm(formula = .outcome ~ duration + energy + instrumentalness + 
##     key1 + key2 + key3 + key5 + key6 + key7 + key8 + key9 + key11 + 
##     loudness + speechiness + tempo + valence, data = dat)
## 
## Coefficients:
##      (Intercept)          duration            energy  instrumentalness  
##        10.984803         -0.555451         -3.160519         -1.884462  
##             key1              key2              key3              key5  
##         0.311835          0.198954          0.425083          0.196319  
##             key6              key7              key8              key9  
##         0.209863          0.274552          0.288519          0.261604  
##            key11          loudness       speechiness             tempo  
##         0.383154          0.387173          4.578753         -0.001535  
##          valence  
##        -0.836556

prediction on test data

# prediction on test data
yhat.step_kcv = predict(mlr.step_kcv$finalModel, newdata=key.dummy(test1) %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.step_kcv <- yhat.step_kcv - test1$popularity^0.5
rmse.step_kcv <- sqrt(mean(error.step_kcv^2))
data.frame(
  RMSE = RMSE(yhat.step_kcv, test1$popularity^0.5),
  R2 = R2(yhat.step_kcv, test1$popularity^0.5)
)
##       RMSE        R2
## 1 1.858518 0.1257011

Regularized Regression: Ridge

lm.ridge1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.ridge1$bestTune
##    alpha lambda
## 45     0  0.045
# best coefficient
lm.ridge1.model <- coef(lm.ridge1$finalModel, lm.ridge1$bestTune$lambda)
lm.ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      10.347839763
## duration         -0.543796248
## acousticness      0.169898922
## danceability      0.049633951
## energy           -2.620990536
## instrumentalness -1.927744817
## key1              0.293618095
## key2              0.182685249
## key3              0.399431145
## key4             -0.049409156
## key5              0.179350934
## key6              0.188771865
## key7              0.259698734
## key8              0.273385099
## key9              0.249069582
## key10             0.051821098
## key11             0.361138772
## loudness          0.355806469
## speechiness       4.325244621
## tempo            -0.001381351
## valence          -0.858954968
# prediction on test data
yhat.ridge1 = predict(lm.ridge1, s=lm.ridge1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
data.frame(
  RMSE = RMSE(yhat.ridge1, test1$popularity^0.5),
  R2 = R2(yhat.ridge1, test1$popularity^0.5)
)
##       RMSE        R2
## 1 1.858182 0.1253763

Regularized Regression: Lasso

lm.lasso1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.lasso1$bestTune
##   alpha lambda
## 3     1  0.003
# best coefficient
lm.lasso1.model <- coef(lm.lasso1$finalModel, lm.lasso1$bestTune$lambda)
lm.lasso1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      10.823106069
## duration         -0.549904268
## acousticness      0.078797866
## danceability      0.007432375
## energy           -3.012459438
## instrumentalness -1.855427319
## key1              0.268514415
## key2              0.153367880
## key3              0.368124062
## key4             -0.060812240
## key5              0.148197878
## key6              0.160460653
## key7              0.231766196
## key8              0.242516531
## key9              0.215811156
## key10             0.020377237
## key11             0.334220735
## loudness          0.381054835
## speechiness       4.504126435
## tempo            -0.001394241
## valence          -0.829567095
# prediction on test data
yhat.lasso1 = predict(lm.lasso1, s=lm.lasso1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.lasso1 <- yhat.lasso1 - test1$popularity^0.5
rmse.lasso1 <- sqrt(mean(error.lasso1^2))
data.frame(
  RMSE = RMSE(yhat.lasso1, test1$popularity^0.5),
  R2 = R2(yhat.lasso1, test1$popularity^0.5)
)
##       RMSE        R2
## 1 1.858393 0.1255584

Regularized Regression: Elastic Net

# Elastic net for mode-1 songs on the continuous sqrt-popularity response.
# The original passed family = "binomial", which is for binary outcomes and
# is mis-specified for a continuous response; omitting it lets glmnet use
# its gaussian default, consistent with the ridge/lasso fits above.
lm.enet1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
                      trControl = train_control,
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lm.enet1$bestTune
# best coefficient
lm.enet1.model <- coef(lm.enet1$finalModel, lm.enet1$bestTune$lambda)
lm.enet1.model
# prediction on test data
yhat.enet1 = predict(lm.enet1, s=lm.enet1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.enet1 <- yhat.enet1 - test1$popularity^0.5
rmse.enet1 <- sqrt(mean(error.enet1^2))
data.frame(
  RMSE = RMSE(yhat.enet1, test1$popularity^0.5),
  R2 = R2(yhat.enet1, test1$popularity^0.5)
)

Logistic (classification approach)

Use 70% train, 15% validation, 15% test, to use validation for finding optimal cutoff value.

kpop.logit <- select(data, popular, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, tempo, valence)
logit.kpop0 <- kpop.logit %>% filter(mode == 0)%>% select(-mode)
logit.kpop1 <- kpop.logit %>% filter(mode == 1) %>% select(-mode)

### Kpop mode 0 train and test
# smpl.size0 <- floor(0.75*nrow(logit.kpop0))
# set.seed(123)
# smpl0 <- sample(nrow(logit.kpop0), smpl.size0, replace = FALSE)
# og.logit.train0 <- logit.kpop0[smpl0,]
# og.logit.test0 <- logit.kpop0[-smpl0,]

p3 <- partition.3(logit.kpop0, 0.70, 0.15)
logit.train0 <- p3$data.train
logit.valid0 <- p3$data.val
logit.test0 <- p3$data.test
all.train0 <- rbind(logit.train0, logit.valid0)

# ### Kpop mode 1 train and test
# smpl.size1 <- floor(0.75*nrow(logit.kpop1))
# set.seed(123)
# smpl1 <- sample(nrow(logit.kpop1), smpl.size1, replace = FALSE)
# logit.train1 <- logit.kpop1[smpl1,]
# logit.test1 <- logit.kpop1[-smpl1,]

p3 <- partition.3(logit.kpop1, 0.70, 0.15)
logit.train1 <- p3$data.train
logit.valid1 <- p3$data.val
logit.test1 <- p3$data.test
all.train1 <- rbind(logit.train1, logit.valid1)

Mode 0 popularity

Full Logistic Model: train/valid/test

fitting logistic model using combo of train/valid/test, finding optimal model using training data.

# Fit logistic model on training data
v.logit.model0 <- glm(popular ~ ., family=binomial(link='logit'),data= logit.train0)

#search for best cutoff
out0 <- opt.cut.func(v.logit.model0, logit.valid0)
opt.cut.plot(out0)

out0$cutoff[which.min(out0$ssdiff.vec)]
## [1] 0.13
v.opt.cut0 <- out0$cutoff[which.max(out0$kappa.vec)]
v.opt.cut0
## [1] 0.16

Fit final model (combo of train and validation)

v.model.final <-  glm(popular ~ ., data=all.train0, family=binomial(link='logit'))

predict on test using 0.5 cutoff

# Baseline evaluation on the held-out test set with the default 0.5 cutoff.
# NOTE(review): this predicts with v.logit.model0 (train-only fit), not the
# v.model.final refit on train+validation above — confirm which was intended.
v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 600  99
##          1   1   1
##                                           
##                Accuracy : 0.8573          
##                  95% CI : (0.8292, 0.8824)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 0.5266          
##                                           
##                   Kappa : 0.0141          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.010000        
##             Specificity : 0.998336        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.858369        
##              Prevalence : 0.142653        
##          Detection Rate : 0.001427        
##    Detection Prevalence : 0.002853        
##       Balanced Accuracy : 0.504168        
##                                           
##        'Positive' Class : 1               
## 

predict on optimal cutoff (0.16)

v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > v.opt.cut0, 1, 0) # using cutoff = 0.16
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 452  54
##          1 149  46
##                                           
##                Accuracy : 0.7104          
##                  95% CI : (0.6753, 0.7438)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1519          
##                                           
##  Mcnemar's Test P-Value : 4.181e-11       
##                                           
##             Sensitivity : 0.46000         
##             Specificity : 0.75208         
##          Pos Pred Value : 0.23590         
##          Neg Pred Value : 0.89328         
##              Prevalence : 0.14265         
##          Detection Rate : 0.06562         
##    Detection Prevalence : 0.27817         
##       Balanced Accuracy : 0.60604         
##                                           
##        'Positive' Class : 1               
## 
v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > 0.13, 1, 0) # using cutoff = 0.13
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 359  39
##          1 242  61
##                                           
##                Accuracy : 0.5991          
##                  95% CI : (0.5618, 0.6357)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1123          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.61000         
##             Specificity : 0.59734         
##          Pos Pred Value : 0.20132         
##          Neg Pred Value : 0.90201         
##              Prevalence : 0.14265         
##          Detection Rate : 0.08702         
##    Detection Prevalence : 0.43224         
##       Balanced Accuracy : 0.60367         
##                                           
##        'Positive' Class : 1               
## 

Variable Selection: Stepwise (10 fold CV)

step-wise 10 fold cross validation

set.seed(123)
train_control <- trainControl(method = "cv", 
                              number = 10) 

# Fit K-fold CV model  
meh <- capture.output(step0_kcv <- train(popular ~ ., data = logit.train0, family = binomial(), 
                 method = "glmStepAIC", trControl = train_control, verbose = FALSE))
step0_kcv <- step0_kcv$finalModel
step0_kcv
## 
## Call:  NULL
## 
## Coefficients:
##      (Intercept)          duration      acousticness            energy  
##           5.1001           -0.4797           -1.6165           -3.9184  
## instrumentalness              key3              key6          loudness  
##         -12.3907            0.6991            0.3999            0.3236  
##      speechiness           valence  
##           3.2522           -1.5547  
## 
## Degrees of Freedom: 3270 Total (i.e. Null);  3261 Residual
## Null Deviance:       2534 
## Residual Deviance: 2377  AIC: 2397
kcv.prob.test0 <- predict(step0_kcv, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")

Find optimal cut off value:

# Search the validation set for the best probability cutoff, sweeping
# 0-1 in 0.01 steps (opt.cut.func defined at the top of the file)
kcv.out0 <- opt.cut.func(step0_kcv, key.dummy(logit.valid0))
opt.cut.plot(kcv.out0)

# Cutoff minimizing |sensitivity - specificity|
kcv.out0$cutoff[which.min(kcv.out0$ssdiff.vec)]
## [1] 0.13
# Cutoff maximizing kappa; stored for reuse on the test set
kcv.out0.cut0 <- kcv.out0$cutoff[which.max(kcv.out0$kappa.vec)]
kcv.out0.cut0
## [1] 0.17

Fit final model (combo of train and validation)

# Refit the stepwise-AIC logistic model with 10-fold CV on the combined
# train+validation data (all.train0); capture.output() discards the step log
finalmeh <- capture.output(step0_kcv.final0 <- train(popular ~ ., data = all.train0, family = binomial(), 
                 method = "glmStepAIC", trControl = train_control, verbose = FALSE))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Keep only the final glm chosen by the stepwise search
step0_kcv.final0 <- step0_kcv.final0$finalModel
step0_kcv.final0
## 
## Call:  NULL
## 
## Coefficients:
##      (Intercept)          duration      acousticness            energy  
##           5.1320           -0.5246           -1.6542           -3.7039  
## instrumentalness              key3              key4              key6  
##         -13.3965            0.5700           -0.2640            0.2858  
##         loudness       speechiness           valence  
##           0.3295            3.1544           -1.4994  
## 
## Degrees of Freedom: 3971 Total (i.e. Null);  3961 Residual
## Null Deviance:       3079 
## Residual Deviance: 2888  AIC: 2910

predict on test using 0.5 cutoff

# Evaluate the refit stepwise model on the held-out test set,
# classifying with the standard 0.5 probability threshold.
kcv.prob.test0 <- predict(step0_kcv.final0,
                          newdata = key.dummy(logit.test0),
                          type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.5, 1, 0)
confusionMatrix(as.factor(kcv.pred.test0),
                as.factor(logit.test0$popular),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 600  99
##          1   1   1
##                                           
##                Accuracy : 0.8573          
##                  95% CI : (0.8292, 0.8824)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 0.5266          
##                                           
##                   Kappa : 0.0141          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.010000        
##             Specificity : 0.998336        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.858369        
##              Prevalence : 0.142653        
##          Detection Rate : 0.001427        
##    Detection Prevalence : 0.002853        
##       Balanced Accuracy : 0.504168        
##                                           
##        'Positive' Class : 1               
## 

predict on test using 0.16 cutoff

# Re-evaluate the refit stepwise model on the test set using the more
# permissive 0.16 probability cutoff.
kcv.prob.test0 <- predict(step0_kcv.final0,
                          newdata = key.dummy(logit.test0),
                          type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.16, 1, 0)
confusionMatrix(as.factor(kcv.pred.test0),
                as.factor(logit.test0$popular),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 451  53
##          1 150  47
##                                           
##                Accuracy : 0.7104          
##                  95% CI : (0.6753, 0.7438)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.157           
##                                           
##  Mcnemar's Test P-Value : 1.607e-11       
##                                           
##             Sensitivity : 0.47000         
##             Specificity : 0.75042         
##          Pos Pred Value : 0.23858         
##          Neg Pred Value : 0.89484         
##              Prevalence : 0.14265         
##          Detection Rate : 0.06705         
##    Detection Prevalence : 0.28103         
##       Balanced Accuracy : 0.61021         
##                                           
##        'Positive' Class : 1               
## 

predict on test using 0.13 cutoff

kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.13, 1, 0) # using cutoff = 0.13 (validation ssdiff optimum; comment previously said 0.16)
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 362  26
##          1 239  74
##                                          
##                Accuracy : 0.622          
##                  95% CI : (0.5849, 0.658)
##     No Information Rate : 0.8573         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1813         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.7400         
##             Specificity : 0.6023         
##          Pos Pred Value : 0.2364         
##          Neg Pred Value : 0.9330         
##              Prevalence : 0.1427         
##          Detection Rate : 0.1056         
##    Detection Prevalence : 0.4465         
##       Balanced Accuracy : 0.6712         
##                                          
##        'Positive' Class : 1              
## 

Variable Selection: All possible regression

# All-possible-subsets (exhaustive) logistic regression on the training set
glmulti.out0 <- glmulti(popular ~ ., data = logit.train0,
                           level = 1,               # No interaction considered
            method = "h",            # Exhaustive approach
            crit = "aic",            # AIC as criteria
            confsetsize = 5,         # Keep 5 best models
            plotty = F, report = F,  # No plot or interim reports
            fitfunction = "glm",     # glm fitting function
            family = binomial)       # binomial to run logistic regression 
# Uncomment to inspect the 5 best formulas:
#glmulti.out0@formulas

view summary of top model

summary(glmulti.out0@objects[[1]])
## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4042  -0.5732  -0.4633  -0.3486   2.4660  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        5.24034    0.74622   7.023 2.18e-12 ***
## duration          -0.47455    0.10297  -4.609 4.05e-06 ***
## acousticness      -1.62079    0.37812  -4.286 1.82e-05 ***
## energy            -4.03171    0.61485  -6.557 5.48e-11 ***
## instrumentalness -12.73308   10.34054  -1.231    0.218    
## loudness           0.32748    0.04685   6.991 2.74e-12 ***
## speechiness        3.16638    0.76128   4.159 3.19e-05 ***
## valence           -1.50323    0.27942  -5.380 7.46e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2534.5  on 3270  degrees of freedom
## Residual deviance: 2389.0  on 3263  degrees of freedom
## AIC: 2405
## 
## Number of Fisher Scoring iterations: 8

Store model

# Store the best all-subsets model for cutoff tuning
allreg.logit0 <- glmulti.out0@objects[[1]]
#search for best cutoff on the validation set
allreg.out0 <- opt.cut.func(allreg.logit0, logit.valid0)
opt.cut.plot(allreg.out0)

# Cutoff minimizing |sensitivity - specificity|
allreg.out0$cutoff[which.min(allreg.out0$ssdiff.vec)]
## [1] 0.13
# Cutoff maximizing kappa; stored for reuse on the test set
allreg.out0.cut0 <- allreg.out0$cutoff[which.max(allreg.out0$kappa.vec)]
allreg.out0.cut0
## [1] 0.17

fit final model to combo of training and validation

# Refit the exhaustive search on the combined train+validation data
glmulti.out0 <- glmulti(popular ~ ., data = all.train0,
                           level = 1,               # No interaction considered
            method = "h",            # Exhaustive approach
            crit = "aic",            # AIC as criteria
            confsetsize = 5,         # Keep 5 best models
            plotty = F, report = F,  # No plot or interim reports
            fitfunction = "glm",     # glm fitting function
            family = binomial)       # binomial to run logistic regression 
# Uncomment to inspect the 5 best formulas:
#glmulti.out0@formulas
summary(glmulti.out0@objects[[1]])
## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3796  -0.5760  -0.4646  -0.3454   2.4844  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        5.20379    0.68148   7.636 2.24e-14 ***
## duration          -0.51635    0.09493  -5.439 5.36e-08 ***
## acousticness      -1.66774    0.34909  -4.777 1.78e-06 ***
## energy            -3.81732    0.55999  -6.817 9.31e-12 ***
## instrumentalness -13.79437    9.65872  -1.428    0.153    
## loudness           0.33261    0.04270   7.789 6.73e-15 ***
## speechiness        3.06337    0.68698   4.459 8.23e-06 ***
## valence           -1.42598    0.25451  -5.603 2.11e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3079.5  on 3971  degrees of freedom
## Residual deviance: 2900.1  on 3964  degrees of freedom
## AIC: 2916.1
## 
## Number of Fisher Scoring iterations: 9

store final model

allreg.logit0.final <- glmulti.out0@objects[[1]]

Predictions with 0.5 as the cut off

# Test-set evaluation of the final all-subsets model at the
# conventional 0.5 probability threshold.
allreg.prob.test0 <- predict(allreg.logit0.final,
                             newdata = logit.test0,
                             type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > 0.5, 1, 0)
confusionMatrix(as.factor(allreg.pred.test0),
                as.factor(logit.test0$popular),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 600  99
##          1   1   1
##                                           
##                Accuracy : 0.8573          
##                  95% CI : (0.8292, 0.8824)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 0.5266          
##                                           
##                   Kappa : 0.0141          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.010000        
##             Specificity : 0.998336        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.858369        
##              Prevalence : 0.142653        
##          Detection Rate : 0.001427        
##    Detection Prevalence : 0.002853        
##       Balanced Accuracy : 0.504168        
##                                           
##        'Positive' Class : 1               
## 

Predictions where cut off is the best kappa

allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > allreg.out0.cut0, 1, 0) # using the validation best-kappa cutoff (0.17), not the standard 0.5
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 472  59
##          1 129  41
##                                           
##                Accuracy : 0.7318          
##                  95% CI : (0.6974, 0.7643)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1512          
##                                           
##  Mcnemar's Test P-Value : 4.845e-07       
##                                           
##             Sensitivity : 0.41000         
##             Specificity : 0.78536         
##          Pos Pred Value : 0.24118         
##          Neg Pred Value : 0.88889         
##              Prevalence : 0.14265         
##          Detection Rate : 0.05849         
##    Detection Prevalence : 0.24251         
##       Balanced Accuracy : 0.59768         
##                                           
##        'Positive' Class : 1               
## 

Predictions where cut off is the best balance of sensitivity and specificity

allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > 0.14, 1, 0) # using cutoff = 0.14 (validation ssdiff optimum was 0.13 -- confirm intended value)
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 383  33
##          1 218  67
##                                           
##                Accuracy : 0.6419          
##                  95% CI : (0.6052, 0.6775)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1735          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.67000         
##             Specificity : 0.63727         
##          Pos Pred Value : 0.23509         
##          Neg Pred Value : 0.92067         
##              Prevalence : 0.14265         
##          Detection Rate : 0.09558         
##    Detection Prevalence : 0.40656         
##       Balanced Accuracy : 0.65364         
##                                           
##        'Positive' Class : 1               
## 

Regularized Regression: Ridge 10 fold Cross Validation

# Ridge (alpha = 0) logistic regression with lambda tuned by 10-fold CV
ridge0 <- train(popular ~ ., data = logit.train0, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge0$bestTune
##     alpha lambda
## 100     0    0.1
# Coefficients at the CV-selected lambda
# NOTE(review): lambda hit the grid's upper bound (0.1) -- consider widening the grid
ridge0.model <- coef(ridge0$finalModel, ridge0$bestTune$lambda)
ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)       0.1915545704
## duration         -0.2316446189
## acousticness     -0.4391019903
## danceability     -0.0498944180
## energy           -0.6681396288
## instrumentalness -1.0177900019
## key1              0.0754947879
## key2              0.0713988992
## key3              0.3855850943
## key4             -0.1246140988
## key5             -0.0585100854
## key6              0.2136400771
## key7              0.0077267527
## key8              0.1054770688
## key9             -0.0325615911
## key10            -0.0408322593
## key11            -0.0604665387
## loudness          0.0766007935
## speechiness       1.5946326720
## tempo             0.0001130078
## valence          -0.7596590057

Search for best cutoff using validation set

# Sweep cutoffs on the validation set (reg.opt.cut.func: presumably the
# glmnet/caret variant of opt.cut.func from thesis_functions.R -- TODO confirm)
ridge0.out <- reg.opt.cut.func(ridge0, logit.valid0)
opt.cut.plot(ridge0.out)

# cut off by kappa
ridge0.out$cutoff[which.max(ridge0.out$kappa.vec)]
## [1] 0.14
# cut off by sensitivity/specificity balance
ridge0.out$cutoff[which.min(ridge0.out$ssdiff.vec)]
## [1] 0.13

create final model

# Refit the ridge model on the combined train+validation data (all.train0)
ridge0 <- train(popular ~ ., data = all.train0, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
ridge0$bestTune
##     alpha lambda
## 100     0    0.1
# Coefficients at the CV-selected lambda (again at the 0.1 grid boundary)
ridge0.model <- coef(ridge0$finalModel, ridge0$bestTune$lambda)
ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)       8.530228e-02
## duration         -2.435653e-01
## acousticness     -4.630692e-01
## danceability      7.287529e-02
## energy           -5.554047e-01
## instrumentalness -9.877648e-01
## key1              7.474446e-02
## key2              8.256308e-02
## key3              3.236407e-01
## key4             -1.290358e-01
## key5              3.251688e-02
## key6              1.653136e-01
## key7             -1.463599e-02
## key8              1.009955e-01
## key9             -7.740618e-02
## key10            -2.259250e-02
## key11            -7.828105e-02
## loudness          8.041691e-02
## speechiness       1.589226e+00
## tempo            -6.196997e-05
## valence          -7.257510e-01

predict and evaluate on the test set where cutoff is at 0.5

# Predict class probabilities on the test set; column 2 is P(popular = 1)
# NOTE(review): `s = ridge0$bestTune` appears to be absorbed by `...` in
# caret's predict.train and ignored -- the stored bestTune is used automatically
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular), 
                positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.ridge0),
## as.factor(logit.test0$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 601 100
##          1   0   0
##                                           
##                Accuracy : 0.8573          
##                  95% CI : (0.8292, 0.8824)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 0.5266          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.8573          
##              Prevalence : 0.1427          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 

predict and evaluate on the test set using a 0.12 cutoff (note: the validation search returned 0.14 as the best-kappa cutoff)

# Classify the test set at the 0.12 cutoff.
# Bug fix: the probabilities were assigned to a typo'd name (`prob.ridg0`)
# and the call referenced the non-existent `ridge0bestTune` (missing `$`),
# so the stale `prob.ridge0` from the 0.5-cutoff chunk was silently reused.
# NOTE(review): the validation best-kappa cutoff was 0.14 -- confirm 0.12 is intended.
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.12, 1, 0) # using cutoff = 0.12
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 226  20
##          1 375  80
##                                           
##                Accuracy : 0.4365          
##                  95% CI : (0.3994, 0.4741)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.071           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.3760          
##          Pos Pred Value : 0.1758          
##          Neg Pred Value : 0.9187          
##              Prevalence : 0.1427          
##          Detection Rate : 0.1141          
##    Detection Prevalence : 0.6491          
##       Balanced Accuracy : 0.5880          
##                                           
##        'Positive' Class : 1               
## 

predict and evaluate on the test set using a 0.14 cutoff for the sensitivity/specificity balance (note: the validation search returned 0.13)

# Classify the test set at the 0.14 cutoff (sensitivity/specificity balance).
# Bug fix: same two typos as the 0.12 chunk -- `prob.ridg0` assignment and
# `ridge0bestTune` (missing `$`) meant the fresh prediction was never used.
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 407  48
##          1 194  52
##                                         
##                Accuracy : 0.6548        
##                  95% CI : (0.6183, 0.69)
##     No Information Rate : 0.8573        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.1226        
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.52000       
##             Specificity : 0.67720       
##          Pos Pred Value : 0.21138       
##          Neg Pred Value : 0.89451       
##              Prevalence : 0.14265       
##          Detection Rate : 0.07418       
##    Detection Prevalence : 0.35093       
##       Balanced Accuracy : 0.59860       
##                                         
##        'Positive' Class : 1             
## 

Regularized Regression: Elastic Net 10 fold Cross Validation

# Elastic net: tune both alpha (0 = ridge ... 1 = lasso) and lambda by 10-fold CV
enet0 <- train(popular ~ ., data = logit.train0, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
enet0$bestTune
##     alpha lambda
## 100     0    0.1
# best coefficient
# CV selected alpha = 0 (pure ridge) at the grid's maximum lambda
enet0.model <- coef(enet0$finalModel, enet0$bestTune$lambda)
enet0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)       0.1915545704
## duration         -0.2316446189
## acousticness     -0.4391019903
## danceability     -0.0498944180
## energy           -0.6681396288
## instrumentalness -1.0177900019
## key1              0.0754947879
## key2              0.0713988992
## key3              0.3855850943
## key4             -0.1246140988
## key5             -0.0585100854
## key6              0.2136400771
## key7              0.0077267527
## key8              0.1054770688
## key9             -0.0325615911
## key10            -0.0408322593
## key11            -0.0604665387
## loudness          0.0766007935
## speechiness       1.5946326720
## tempo             0.0001130078
## valence          -0.7596590057

search for best cutoff with validation set

# Sweep cutoffs on the validation set for the elastic net
enet0.out <- reg.opt.cut.func(enet0, logit.valid0)
opt.cut.plot(enet0.out)

# cut off by kappa
enet0.out$cutoff[which.max(enet0.out$kappa.vec)]
## [1] 0.14
# cut off by sensitivity/specificity balance
enet0.out$cutoff[which.min(enet0.out$ssdiff.vec)]
## [1] 0.13

create final model on train and validation

# Refit the elastic net on train + validation combined for final evaluation
enet0 <- train(popular ~ ., data = all.train0, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
# NOTE(review): lambda again selected at the grid boundary (0.1) -- verify grid.
enet0$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected lambda
enet0.model <- coef(enet0$finalModel, enet0$bestTune$lambda)
enet0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)       8.530228e-02
## duration         -2.435653e-01
## acousticness     -4.630692e-01
## danceability      7.287529e-02
## energy           -5.554047e-01
## instrumentalness -9.877648e-01
## key1              7.474446e-02
## key2              8.256308e-02
## key3              3.236407e-01
## key4             -1.290358e-01
## key5              3.251688e-02
## key6              1.653136e-01
## key7             -1.463599e-02
## key8              1.009955e-01
## key9             -7.740618e-02
## key10            -2.259250e-02
## key11            -7.828105e-02
## loudness          8.041691e-02
## speechiness       1.589226e+00
## tempo            -6.196997e-05
## valence          -7.257510e-01

predict and evaluate on the test set where cutoff is at 0.5

# Predict class probabilities on the test set and evaluate at the default 0.5
# cutoff. FIX: dropped `s = enet0$bestTune` -- predict.train() always predicts
# from the stored bestTune model and ignores that argument; passing it was
# misleading. `newdata` is now named for clarity.
prob.enet0 <- predict(enet0, newdata = logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[, 2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular), 
                positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.enet0),
## as.factor(logit.test0$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 601 100
##          1   0   0
##                                           
##                Accuracy : 0.8573          
##                  95% CI : (0.8292, 0.8824)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 0.5266          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.8573          
##              Prevalence : 0.1427          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 

Predict and evaluate on the test set with a cutoff of 0.12. (Note: the validation-set optima above were 0.14 for kappa and 0.13 for the sensitivity/specificity balance, so 0.12 matches neither exactly — confirm the intended cutoff.)

# Predict on the test set at a lowered cutoff of 0.12.
# NOTE(review): `s = enet0$bestTune` is ignored by predict.train() (bestTune is
# used automatically). Also, the validation-set optima above were 0.14 (kappa)
# and 0.13 (sens/spec balance) -- confirm 0.12 is the intended cutoff.
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.12, 1, 0) # using cutoff = 0.12
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 226  20
##          1 375  80
##                                           
##                Accuracy : 0.4365          
##                  95% CI : (0.3994, 0.4741)
##     No Information Rate : 0.8573          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.071           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.3760          
##          Pos Pred Value : 0.1758          
##          Neg Pred Value : 0.9187          
##              Prevalence : 0.1427          
##          Detection Rate : 0.1141          
##    Detection Prevalence : 0.6491          
##       Balanced Accuracy : 0.5880          
##                                           
##        'Positive' Class : 1               
## 

Predict and evaluate on the test set with a cutoff of 0.14. (Note: 0.14 was the validation-set kappa optimum; the sensitivity/specificity-balance optimum was 0.13.)

# Predict on the test set at a cutoff of 0.14.
# NOTE(review): `s = enet0$bestTune` is ignored by predict.train().
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 407  48
##          1 194  52
##                                         
##                Accuracy : 0.6548        
##                  95% CI : (0.6183, 0.69)
##     No Information Rate : 0.8573        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.1226        
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.52000       
##             Specificity : 0.67720       
##          Pos Pred Value : 0.21138       
##          Neg Pred Value : 0.89451       
##              Prevalence : 0.14265       
##          Detection Rate : 0.07418       
##    Detection Prevalence : 0.35093       
##       Balanced Accuracy : 0.59860       
##                                         
##        'Positive' Class : 1             
## 

Mode 1 popularity

Full Logistic Model: train/valid/test

Fit a logistic model under the train/validation/test split: the model is estimated on the training data, and the validation set is used to choose the optimal probability cutoff.

# Fit the full logistic model on the training partition
v.logit.model1 <- glm(popular ~ ., family=binomial(link='logit'),data= logit.train1)

# Search the validation set for the best probability cutoff
out1 <- opt.cut.func(v.logit.model1, logit.valid1)
opt.cut.plot(out1)

# cutoff minimizing the sensitivity/specificity gap
out1$cutoff[which.min(out1$ssdiff.vec)]
## [1] 0.11
# cutoff maximizing kappa
v.opt.cut1 <- out1$cutoff[which.max(out1$kappa.vec)]
v.opt.cut1
## [1] 0.16

Fit final model (combo of train and validation)

v.model.final1 <-  glm(popular ~ ., data=all.train1, family=binomial(link='logit'))

predict on test using 0.5 cutoff

# Predict on the test set at the default 0.5 cutoff.
# BUG FIX: predictions previously used v.logit.model1 (fit on training data
# only) even though v.model.final1 was refit on train + validation for exactly
# this step. Also corrected the stale inline comment ("0.15" vs. actual 0.5).
# Re-knit to refresh the printed confusion matrix.
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 990 109
##          1   1   1
##                                           
##                Accuracy : 0.9001          
##                  95% CI : (0.8808, 0.9172)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 0.5254          
##                                           
##                   Kappa : 0.0143          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0090909       
##             Specificity : 0.9989909       
##          Pos Pred Value : 0.5000000       
##          Neg Pred Value : 0.9008189       
##              Prevalence : 0.0999092       
##          Detection Rate : 0.0009083       
##    Detection Prevalence : 0.0018165       
##       Balanced Accuracy : 0.5040409       
##                                           
##        'Positive' Class : 1               
## 

predict on test using optimal cutoff (kappa)

# Predict on the test set at the validation kappa-optimal cutoff.
# BUG FIX: use v.model.final1 (train + validation refit) instead of the
# training-only v.logit.model1. Re-knit to refresh the printed output.
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > v.opt.cut1, 1, 0) # using cutoff = 0.16
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 800  71
##          1 191  39
##                                           
##                Accuracy : 0.762           
##                  95% CI : (0.7357, 0.7869)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.109           
##                                           
##  Mcnemar's Test P-Value : 1.955e-13       
##                                           
##             Sensitivity : 0.35455         
##             Specificity : 0.80727         
##          Pos Pred Value : 0.16957         
##          Neg Pred Value : 0.91848         
##              Prevalence : 0.09991         
##          Detection Rate : 0.03542         
##    Detection Prevalence : 0.20890         
##       Balanced Accuracy : 0.58091         
##                                           
##        'Positive' Class : 1               
## 

predict on test using optimal cutoff (sensitivity and specificity)

# Predict on the test set at the sensitivity/specificity-balance cutoff (0.11).
# BUG FIX: use v.model.final1 (train + validation refit) instead of the
# training-only v.logit.model1. Re-knit to refresh the printed output.
v.prob.test1 <- predict(v.model.final1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 560  39
##          1 431  71
##                                           
##                Accuracy : 0.5731          
##                  95% CI : (0.5433, 0.6026)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0815          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.64545         
##             Specificity : 0.56509         
##          Pos Pred Value : 0.14143         
##          Neg Pred Value : 0.93489         
##              Prevalence : 0.09991         
##          Detection Rate : 0.06449         
##    Detection Prevalence : 0.45595         
##       Balanced Accuracy : 0.60527         
##                                           
##        'Positive' Class : 1               
## 

Variable Selection: Stepwise (10 fold CV)

step-wise 10 fold cross validation

set.seed(123)
# 10-fold CV control, reused by the caret fits below
train_control <- trainControl(method = "cv", 
                              number = 10) 

# Stepwise AIC selection under 10-fold CV; capture.output() discards the very
# verbose glmStepAIC trace (the `meh` object is never used again)
meh <- capture.output(step1_kcv <- train(popular ~ ., data = logit.train1, family = binomial(), 
                 method = "glmStepAIC", trControl = train_control, verbose = FALSE))
# keep only the underlying glm selected by the stepwise search
step1_kcv <- step1_kcv$finalModel
step1_kcv
## 
## Call:  NULL
## 
## Coefficients:
##      (Intercept)          duration      acousticness      danceability  
##           4.1871           -0.4871           -0.7884            0.7627  
##           energy  instrumentalness              key4              key8  
##          -3.7851          -13.4344           -0.4462            0.2437  
##            key10          loudness       speechiness           valence  
##          -0.5490            0.3480            5.4057           -1.4309  
## 
## Degrees of Freedom: 5136 Total (i.e. Null);  5125 Residual
## Null Deviance:       3675 
## Residual Deviance: 3432  AIC: 3456
# Search the validation set for the best cutoff; key.dummy() expands `key`
# into the key1..key11 indicators the stepwise model was fit with
kcv.out1 <- opt.cut.func(step1_kcv, key.dummy(logit.valid1))
opt.cut.plot(kcv.out1)

# cutoff minimizing the sensitivity/specificity gap
kcv.out1$cutoff[which.min(kcv.out1$ssdiff.vec)]
## [1] 0.11
# cutoff maximizing kappa
kcv.out1.cut1 <- kcv.out1$cutoff[which.max(kcv.out1$kappa.vec)]
kcv.out1.cut1
## [1] 0.18

Fit final model (combo of train and validation)

# Refit the stepwise-AIC model on train + validation combined (trace suppressed)
finalmeh <- capture.output(step1_kcv.final <- train(popular ~ ., data = all.train1, family = binomial(), 
                 method = "glmStepAIC", trControl = train_control, verbose = FALSE))
# keep only the selected glm
step1_kcv.final <- step1_kcv.final$finalModel
step1_kcv.final
## 
## Call:  NULL
## 
## Coefficients:
##      (Intercept)          duration      acousticness      danceability  
##           4.0904           -0.4978           -0.9406            0.6354  
##           energy  instrumentalness              key3              key4  
##          -3.6817          -13.6231            0.4237           -0.3088  
##             key8             key10          loudness       speechiness  
##           0.2698           -0.6307            0.3306            5.1625  
##          valence  
##          -1.3215  
## 
## Degrees of Freedom: 6237 Total (i.e. Null);  6225 Residual
## Null Deviance:       4372 
## Residual Deviance: 4096  AIC: 4122

predict on test using 0.5 cutoff

# Predict on the test set (key expanded to dummies) at the default 0.5 cutoff
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 990 110
##          1   1   0
##                                           
##                Accuracy : 0.8992          
##                  95% CI : (0.8799, 0.9163)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 0.5651          
##                                           
##                   Kappa : -0.0018         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000000       
##             Specificity : 0.9989909       
##          Pos Pred Value : 0.0000000       
##          Neg Pred Value : 0.9000000       
##              Prevalence : 0.0999092       
##          Detection Rate : 0.0000000       
##    Detection Prevalence : 0.0009083       
##       Balanced Accuracy : 0.4994955       
##                                           
##        'Positive' Class : 1               
## 
# Predict on the test set at the validation kappa-optimal cutoff
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > kcv.out1.cut1, 1, 0) # using cutoff = kcv.out1.cut1 (0.18)
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 871  82
##          1 120  28
##                                          
##                Accuracy : 0.8165         
##                  95% CI : (0.7924, 0.839)
##     No Information Rate : 0.9001         
##     P-Value [Acc > NIR] : 1.000000       
##                                          
##                   Kappa : 0.1157         
##                                          
##  Mcnemar's Test P-Value : 0.009233       
##                                          
##             Sensitivity : 0.25455        
##             Specificity : 0.87891        
##          Pos Pred Value : 0.18919        
##          Neg Pred Value : 0.91396        
##              Prevalence : 0.09991        
##          Detection Rate : 0.02543        
##    Detection Prevalence : 0.13442        
##       Balanced Accuracy : 0.56673        
##                                          
##        'Positive' Class : 1              
## 
# Predict on the test set at the sensitivity/specificity-balance cutoff
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 577  40
##          1 414  70
##                                           
##                Accuracy : 0.5876          
##                  95% CI : (0.5579, 0.6169)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.087           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.63636         
##             Specificity : 0.58224         
##          Pos Pred Value : 0.14463         
##          Neg Pred Value : 0.93517         
##              Prevalence : 0.09991         
##          Detection Rate : 0.06358         
##    Detection Prevalence : 0.43960         
##       Balanced Accuracy : 0.60930         
##                                           
##        'Positive' Class : 1               
## 

Variable Selection: All possible regression

# Exhaustive best-subsets search over main effects, ranked by AIC
glmulti.out1 <- glmulti(popular ~ ., data = logit.train1,
                           level = 1,               # No interaction considered
            method = "h",            # Exhaustive approach
            crit = "aic",            # AIC as criteria
            confsetsize = 5,         # Keep 5 best models
            plotty = F, report = F,  # No plot or interim reports
            fitfunction = "glm",     # glm as the fitting function
            family = binomial)       # binomial to run logistic regression 
#glmulti.out1@formulas

view summary of top model

summary(glmulti.out1@objects[[1]])
## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2927  -0.5336  -0.4321  -0.3262   2.7541  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.10815    0.68233   6.021 1.74e-09 ***
## duration          -0.48381    0.08923  -5.422 5.89e-08 ***
## acousticness      -0.82521    0.27036  -3.052  0.00227 ** 
## danceability       0.82523    0.42883   1.924  0.05431 .  
## energy            -3.78352    0.51679  -7.321 2.46e-13 ***
## instrumentalness -13.40830    7.93108  -1.691  0.09091 .  
## loudness           0.34743    0.03805   9.130  < 2e-16 ***
## speechiness        5.44074    0.61682   8.821  < 2e-16 ***
## valence           -1.40523    0.27010  -5.203 1.97e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3675.4  on 5136  degrees of freedom
## Residual deviance: 3442.9  on 5128  degrees of freedom
## AIC: 3460.9
## 
## Number of Fisher Scoring iterations: 9

Store model

# Store the best all-subsets model
allreg.logit1 <- glmulti.out1@objects[[1]]
# Search the validation set for the best probability cutoff
allreg.out1 <- opt.cut.func(allreg.logit1, logit.valid1)
opt.cut.plot(allreg.out1)

# cutoff minimizing the sensitivity/specificity gap
allreg.out1$cutoff[which.min(allreg.out1$ssdiff.vec)]
## [1] 0.11
# cutoff maximizing kappa
allreg.out1.cut1 <- allreg.out1$cutoff[which.max(allreg.out1$kappa.vec)]
allreg.out1.cut1
## [1] 0.16

fit final model to combo of training and validation

# Rerun the exhaustive search on train + validation combined
# NOTE(review): this overwrites glmulti.out1 from the training-only search
glmulti.out1 <- glmulti(popular ~ ., data = all.train1,
                           level = 1,               # No interaction considered
            method = "h",            # Exhaustive approach
            crit = "aic",            # AIC as criteria
            confsetsize = 5,         # Keep 5 best models
            plotty = F, report = F,  # No plot or interim reports
            fitfunction = "glm",     # glm as the fitting function
            family = binomial)       # binomial to run logistic regression 
#glmulti.out1@formulas
summary(glmulti.out1@objects[[1]])
## 
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9085  -0.5249  -0.4297  -0.3287   2.7358  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.06861    0.62756   6.483 8.98e-11 ***
## duration          -0.49333    0.08221  -6.001 1.96e-09 ***
## acousticness      -0.94155    0.24944  -3.775  0.00016 ***
## danceability       0.63868    0.39311   1.625  0.10423    
## energy            -3.68922    0.47529  -7.762 8.36e-15 ***
## instrumentalness -13.48141    7.35928  -1.832  0.06697 .  
## loudness           0.32993    0.03477   9.488  < 2e-16 ***
## speechiness        5.19880    0.55978   9.287  < 2e-16 ***
## valence           -1.30087    0.24869  -5.231 1.69e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4372.3  on 6237  degrees of freedom
## Residual deviance: 4113.3  on 6229  degrees of freedom
## AIC: 4131.3
## 
## Number of Fisher Scoring iterations: 9

store final model

allreg.logit1.final <- glmulti.out1@objects[[1]]

Predictions with 0.5 as the cut off

# Predict on the test set at the default 0.5 cutoff
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > 0.5, 1, 0) # using standard cutoff 0.5
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 990 109
##          1   1   1
##                                           
##                Accuracy : 0.9001          
##                  95% CI : (0.8808, 0.9172)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 0.5254          
##                                           
##                   Kappa : 0.0143          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0090909       
##             Specificity : 0.9989909       
##          Pos Pred Value : 0.5000000       
##          Neg Pred Value : 0.9008189       
##              Prevalence : 0.0999092       
##          Detection Rate : 0.0009083       
##    Detection Prevalence : 0.0018165       
##       Balanced Accuracy : 0.5040409       
##                                           
##        'Positive' Class : 1               
## 

Predictions where cut off is the best kappa

# Predict on the test set at the validation kappa-optimal cutoff
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > allreg.out1.cut1, 1, 0) # using optimal kappa cutoff (0.16)
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 821  75
##          1 170  35
##                                           
##                Accuracy : 0.7775          
##                  95% CI : (0.7517, 0.8017)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.106           
##                                           
##  Mcnemar's Test P-Value : 1.908e-09       
##                                           
##             Sensitivity : 0.31818         
##             Specificity : 0.82846         
##          Pos Pred Value : 0.17073         
##          Neg Pred Value : 0.91629         
##              Prevalence : 0.09991         
##          Detection Rate : 0.03179         
##    Detection Prevalence : 0.18619         
##       Balanced Accuracy : 0.57332         
##                                           
##        'Positive' Class : 1               
## 

Predictions where cut off is the best balance of sensitivity and specificity

# Predict on the test set at the sensitivity/specificity-balance cutoff
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > 0.11, 1, 0) # using cutoff = 0.11 (sens/spec balance)
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 585  41
##          1 406  69
##                                           
##                Accuracy : 0.594           
##                  95% CI : (0.5643, 0.6232)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0879          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.62727         
##             Specificity : 0.59031         
##          Pos Pred Value : 0.14526         
##          Neg Pred Value : 0.93450         
##              Prevalence : 0.09991         
##          Detection Rate : 0.06267         
##    Detection Prevalence : 0.43143         
##       Balanced Accuracy : 0.60879         
##                                           
##        'Positive' Class : 1               
## 

Regularized Regression: Ridge 10 fold Cross Validation

# Ridge regression (alpha fixed at 0), lambda tuned by CV on training data
ridge1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
# NOTE(review): lambda = 0.1 is the top of the grid -- the optimum may lie beyond it
ridge1$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected lambda
ridge1.model <- coef(ridge1$finalModel, ridge1$bestTune$lambda)
ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)      -0.7794612024
## duration         -0.2103504081
## acousticness     -0.1043006389
## danceability      0.1845311234
## energy           -0.3424994323
## instrumentalness -0.8360208822
## key1             -0.0052137727
## key2              0.0277371298
## key3              0.1656719284
## key4             -0.1672407313
## key5              0.0652609329
## key6             -0.0837961594
## key7              0.0624070766
## key8              0.1531177410
## key9              0.1399246098
## key10            -0.2261685699
## key11             0.0117086597
## loudness          0.0667931811
## speechiness       2.5045118512
## tempo             0.0005094302
## valence          -0.5176594535

Search for best cutoff using validation set

# Scan candidate cutoffs on the validation set
ridge1.out <- reg.opt.cut.func(ridge1, logit.valid1)
opt.cut.plot(ridge1.out)

# cut off by kappa
ridge1.out$cutoff[which.max(ridge1.out$kappa.vec)]
## [1] 0.15
# cut off by smallest sensitivity/specificity gap
ridge1.out$cutoff[which.min(ridge1.out$ssdiff.vec)]
## [1] 0.12

create final model

# Refit the ridge model on train + validation combined
ridge1 <- train(popular ~ ., data = all.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
# NOTE(review): lambda again selected at the grid boundary (0.1) -- verify grid.
ridge1$bestTune
##     alpha lambda
## 100     0    0.1
# coefficients at the selected lambda
ridge1.model <- coef(ridge1$finalModel, ridge1$bestTune$lambda)
ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                             1
## (Intercept)      -0.895075690
## duration         -0.210354815
## acousticness     -0.142604779
## danceability      0.144718060
## energy           -0.323758232
## instrumentalness -0.838304065
## key1              0.059858865
## key2              0.012365812
## key3              0.208277667
## key4             -0.125410825
## key5              0.041362313
## key6             -0.023137874
## key7              0.065428774
## key8              0.153045769
## key9              0.084576094
## key10            -0.256890886
## key11            -0.017272069
## loudness          0.062665247
## speechiness       2.357820019
## tempo             0.001015292
## valence          -0.474750339

Predict and evaluate on the test set with the cutoff at 0.5.

# Class-probability predictions on the test set. caret's predict.train()
# already predicts from the tuned finalModel at bestTune, so the original
# `s = ridge1$bestTune` argument was unnecessary and has been dropped.
prob.ridge1 <- predict(ridge1, logit.test1, type = "prob")
# Classify at the default 0.5 cutoff; select the positive-class column by
# its level name "1" rather than by position, which is more robust.
pred.ridge1 <- ifelse(prob.ridge1[, "1"] > 0.50, 1, 0) # using cutoff = 0.5
# Confusion matrix with class "1" (popular) treated as the positive class.
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular), 
                positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.ridge1),
## as.factor(logit.test1$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 991 110
##          1   0   0
##                                           
##                Accuracy : 0.9001          
##                  95% CI : (0.8808, 0.9172)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 0.5254          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.90009         
##              Prevalence : 0.09991         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : 1               
## 

Predict and evaluate on the test set with the cutoff at 0.13, based on the kappa-optimal value found on the validation set.

# Test-set probabilities from the final ridge model (predict.train uses the
# tuned finalModel itself; the redundant `s =` argument has been removed).
prob.ridge1 <- predict(ridge1, logit.test1, type = "prob")
# NOTE(review): the validation search reported a kappa-optimal cutoff of 0.15,
# but 0.13 is applied here — confirm which value was intended.
pred.ridge1 <- ifelse(prob.ridge1[, "1"] > 0.13, 1, 0) # using cutoff = 0.13
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 829  78
##          1 162  32
##                                           
##                Accuracy : 0.782           
##                  95% CI : (0.7564, 0.8061)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0951          
##                                           
##  Mcnemar's Test P-Value : 8.432e-08       
##                                           
##             Sensitivity : 0.29091         
##             Specificity : 0.83653         
##          Pos Pred Value : 0.16495         
##          Neg Pred Value : 0.91400         
##              Prevalence : 0.09991         
##          Detection Rate : 0.02906         
##    Detection Prevalence : 0.17620         
##       Balanced Accuracy : 0.56372         
##                                           
##        'Positive' Class : 1               
## 

Predict and evaluate on the test set with the cutoff at 0.11, based on the validation-set cutoff that best balances sensitivity and specificity.

# Test-set probabilities from the final ridge model (redundant `s =` removed;
# predict.train already predicts at the tuned lambda).
prob.ridge1 <- predict(ridge1, logit.test1, type = "prob")
# Original inline comment incorrectly said 0.13; the cutoff used here is 0.11.
# NOTE(review): validation reported 0.12 as the sens/spec-balancing cutoff —
# confirm 0.11 was intended.
pred.ridge1 <- ifelse(prob.ridge1[, "1"] > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 533  32
##          1 458  78
##                                          
##                Accuracy : 0.555          
##                  95% CI : (0.525, 0.5846)
##     No Information Rate : 0.9001         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0907         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.70909        
##             Specificity : 0.53784        
##          Pos Pred Value : 0.14552        
##          Neg Pred Value : 0.94336        
##              Prevalence : 0.09991        
##          Detection Rate : 0.07084        
##    Detection Prevalence : 0.48683        
##       Balanced Accuracy : 0.62347        
##                                          
##        'Positive' Class : 1              
## 

Regularized Regression: Lasso 10-fold Cross-Validation (problematic: the selected model shrinks every predictor to zero, leaving only the intercept, so it is not pursued further).

# Fit the lasso (glmnet with alpha = 1) with the same CV setup. On this data
# the selected model keeps only the intercept (all predictors shrunk to zero),
# so it is not carried forward to test-set evaluation.
lasso1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lasso1$bestTune
# best coefficient
lasso1.model <- coef(lasso1$finalModel, lasso1$bestTune$lambda)
lasso1.model

Search for the best classification cutoff using the validation set.

# Cutoff sweep for the lasso model on the validation set (of limited use
# here, since the fitted model is intercept-only).
lasso1.out <- reg.opt.cut.func(lasso1, logit.valid1)
opt.cut.plot(lasso1.out)
# cut off by kappa
lasso1.out$cutoff[which.max(lasso1.out$kappa.vec)]
# cutoff minimizing the sensitivity/specificity gap
lasso1.out$cutoff[which.min(lasso1.out$ssdiff.vec)]

Regularized Regression: Elastic Net 10 fold Cross Validation

# Elastic net: tune alpha over [0, 1] and lambda jointly with 10-fold CV.
enet1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
# NOTE(review): CV selects alpha = 0, so on the training partition this
# collapses to the ridge model fit above (identical coefficients).
enet1$bestTune
##     alpha lambda
## 100     0    0.1
# best coefficient
enet1.model <- coef(enet1$finalModel, enet1$bestTune$lambda)
enet1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)      -0.7794612024
## duration         -0.2103504081
## acousticness     -0.1043006389
## danceability      0.1845311234
## energy           -0.3424994323
## instrumentalness -0.8360208822
## key1             -0.0052137727
## key2              0.0277371298
## key3              0.1656719284
## key4             -0.1672407313
## key5              0.0652609329
## key6             -0.0837961594
## key7              0.0624070766
## key8              0.1531177410
## key9              0.1399246098
## key10            -0.2261685699
## key11             0.0117086597
## loudness          0.0667931811
## speechiness       2.5045118512
## tempo             0.0005094302
## valence          -0.5176594535

Search for the best classification cutoff using the validation set.

# Cutoff sweep for the elastic-net model on the validation set.
enet1.out <- reg.opt.cut.func(enet1, logit.valid1)
opt.cut.plot(enet1.out)

# cut off by kappa
enet1.out$cutoff[which.max(enet1.out$kappa.vec)]
## [1] 0.15
# cutoff minimizing the sensitivity/specificity gap
enet1.out$cutoff[which.min(enet1.out$ssdiff.vec)]
## [1] 0.12

Create the final model by refitting on the combined training and validation data.

# Refit the elastic net on the combined train + validation data (all.train1).
enet1 <- train(popular ~ ., data = all.train1, method = "glmnet",
                      family = "binomial", trControl = train_control, 
                      tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
# NOTE(review): on the larger data, CV now selects alpha = 0.05 (a true
# elastic net), which zeroes out several coefficients below.
enet1$bestTune
##     alpha lambda
## 200  0.05    0.1
# best coefficient
enet1.model <- coef(enet1$finalModel, enet1$bestTune$lambda)
enet1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                              1
## (Intercept)      -1.1792935147
## duration         -0.1699469465
## acousticness     -0.0051000448
## danceability      .           
## energy           -0.0466349208
## instrumentalness -0.4146454458
## key1              .           
## key2              .           
## key3              0.0528461274
## key4             -0.0243407299
## key5              .           
## key6              .           
## key7              .           
## key8              0.0564175836
## key9              .           
## key10            -0.1479976169
## key11             .           
## loudness          0.0475991270
## speechiness       2.0833703322
## tempo             0.0001026083
## valence          -0.3234255985

Predict and evaluate on the test set with the cutoff at 0.5.

# Test-set class probabilities from the final elastic-net model. caret's
# predict.train() predicts from the tuned finalModel automatically, so the
# original `s = enet1$bestTune` argument was unnecessary and has been dropped.
prob.enet1 <- predict(enet1, logit.test1, type = "prob")
# Classify at the default 0.5 cutoff; select the positive-class column by
# its level name "1" rather than by position.
pred.enet1 <- ifelse(prob.enet1[, "1"] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular), 
                positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.enet1),
## as.factor(logit.test1$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 991 110
##          1   0   0
##                                           
##                Accuracy : 0.9001          
##                  95% CI : (0.8808, 0.9172)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 0.5254          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.90009         
##              Prevalence : 0.09991         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : 1               
## 

Predict and evaluate on the test set with the cutoff at 0.13, based on the kappa-optimal value found on the validation set.

# Test-set probabilities from the final elastic net (redundant `s =` removed).
prob.enet1 <- predict(enet1, logit.test1, type = "prob")
# NOTE(review): validation reported a kappa-optimal cutoff of 0.15, but 0.13
# is applied here — confirm which value was intended.
pred.enet1 <- ifelse(prob.enet1[, "1"] > 0.13, 1, 0) # using cutoff = 0.13
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 870  85
##          1 121  25
##                                           
##                Accuracy : 0.8129          
##                  95% CI : (0.7886, 0.8355)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0.0918          
##                                           
##  Mcnemar's Test P-Value : 0.01475         
##                                           
##             Sensitivity : 0.22727         
##             Specificity : 0.87790         
##          Pos Pred Value : 0.17123         
##          Neg Pred Value : 0.91099         
##              Prevalence : 0.09991         
##          Detection Rate : 0.02271         
##    Detection Prevalence : 0.13261         
##       Balanced Accuracy : 0.55259         
##                                           
##        'Positive' Class : 1               
## 

Predict and evaluate on the test set with the cutoff at 0.11, based on the validation-set cutoff that best balances sensitivity and specificity.

# Test-set probabilities from the final elastic net (redundant `s =` removed).
prob.enet1 <- predict(enet1, logit.test1, type = "prob")
# Original inline comment incorrectly said 0.13; the cutoff used here is 0.11.
# NOTE(review): validation reported 0.12 as the sens/spec-balancing cutoff —
# confirm 0.11 was intended.
pred.enet1 <- ifelse(prob.enet1[, "1"] > 0.11, 1, 0) # using cutoff = 0.11
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 511  31
##          1 480  79
##                                           
##                Accuracy : 0.5359          
##                  95% CI : (0.5059, 0.5657)
##     No Information Rate : 0.9001          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0831          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.71818         
##             Specificity : 0.51564         
##          Pos Pred Value : 0.14132         
##          Neg Pred Value : 0.94280         
##              Prevalence : 0.09991         
##          Detection Rate : 0.07175         
##    Detection Prevalence : 0.50772         
##       Balanced Accuracy : 0.61691         
##                                           
##        'Positive' Class : 1               
##